library(tidyverse)
Read raw data.
# Load the sample-attribute table (tab-separated; one row per
# sample/attribute pair, given the attr_type / attr_value columns).
# Column types are pinned explicitly so the result does not depend on
# readr's type guessing.
mmetsp_raw_data <- read_tsv(
  "../data/sample-attr.tab.txt",
  col_types = cols(
    sample_id = col_double(),
    sample_name = col_character(),
    attr_type = col_character(),
    attr_value = col_character()
  )
)
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────
cols(
sample_id = col_double(),
sample_name = col_character(),
attr_type = col_character(),
attr_value = col_character()
)
mmetsp_raw_data
Unpack attributes.
# Spread the long attribute table into one row per sample, with one column
# per distinct attr_type.
mmetsp_wider <- mmetsp_raw_data %>%
  # Rename the identifier column before pivoting.
  # NOTE(review): presumably this avoids a clash with an attribute that is
  # itself called "sample_name" -- confirm against the data.
  rename(sample_name_main = sample_name) %>%
  pivot_wider(
    id_cols = c("sample_id", "sample_name_main"),
    names_from = "attr_type",
    values_from = "attr_value",
    # Some sample/attribute pairs occur more than once, so every value column
    # is kept as a list-column. Stating this explicitly replaces the implicit
    # fallback (and its warning) that pivot_wider() otherwise applies.
    values_fn = list,
    names_repair = "unique"
  )
Values are not uniquely identified; output will contain list-cols.
* Use `values_fn = list` to suppress this warning.
* Use `values_fn = length` to identify where the duplicates arise
* Use `values_fn = {summary_fun}` to summarise duplicates
mmetsp_wider
Select and unnest taxon info.
# Keep the sample identifiers plus the taxonomy attributes, expand the
# taxonomy list-columns into plain columns, and build one underscore-separated
# label per row.
mmetsp_taxon <- mmetsp_wider %>%
  select(
    sample_id, sample_name_main,
    taxon_id, phylum, class, order, genus, species, strain
  ) %>%
  # unnest() needs its columns named explicitly; calling it with no `cols`
  # is deprecated. These are the list-columns produced by the pivot.
  unnest(cols = c(taxon_id, phylum, class, order, genus, species, strain)) %>%
  mutate(
    # Join genus, species and strain with "_" and replace embedded spaces.
    # NOTE(review): paste() renders a missing value as the text "NA", so a
    # sample without a strain would end in "_NA" -- confirm this is intended.
    genus_species_strain = gsub(" ", "_", paste(genus, species, strain, sep = "_"))
  )
`cols` is now required when using unnest().
Please use `cols = c(taxon_id, phylum, class, order, genus, species, strain)`
mmetsp_taxon
Select only barebones.
# Reduce the taxon table to the identifier columns and the combined
# genus/species/strain label.
mmetsp_select <- select(
  mmetsp_taxon,
  sample_id,
  sample_name_main,
  taxon_id,
  genus_species_strain
)
# Column names for the GenBank assembly summary, in file order. They are
# supplied here because the file's own header lines start with "#" and are
# skipped below.
# NOTE(review): "infraspecific_name2" and "isolateversion_status" look like they
# stand in for NCBI's "isolate" and "version_status" columns at positions 10
# and 11. They are kept unchanged because later code may refer to these
# names -- confirm before renaming.
colNames <- "assembly_accession, bioproject, biosample, wgs_master, refseq_category, taxid, species_taxid, organism_name, infraspecific_name, infraspecific_name2, isolateversion_status, assembly_level, release_type, genome_rep, seq_rel_date, asm_name, submitter, gbrs_paired_asm, paired_asm_comp, ftp_path, excluded_from_refseq, relation_to_type_material"
colNamesVec <- unlist(str_split(colNames, ", "))

genbank_path <- "../data/assembly_summary_genbank.txt"

# Count the leading run of "#" header lines so they can be skipped by position.
# `comment = "#"` is deliberately not used: readr discards everything after a
# "#" anywhere in a line, which truncates records whose fields contain "#"
# and yields rows with fewer than 22 columns.
# NOTE(review): assumes "#" lines occur only at the top of the file -- confirm.
genbank_head <- read_lines(genbank_path, n_max = 100)
genbank_n_header <- sum(cumprod(startsWith(genbank_head, "#")))

genbank <- read_tsv(
  genbank_path,
  skip = genbank_n_header,
  col_names = colNamesVec,
  # NOTE(review): assumes the file is plain tab-delimited with no CSV-style
  # quoting, so quote characters inside fields are kept literally -- confirm.
  quote = "",
  # Read the taxonomy IDs as text from the start. Parsing them as numbers and
  # converting afterwards can produce scientific notation (e.g. "1e+05"),
  # which would not match text IDs elsewhere.
  col_types = cols(
    .default = col_character(),
    seq_rel_date = col_date(format = "")
  )
)
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
.default = col_character(),
taxid = col_double(),
species_taxid = col_double(),
seq_rel_date = col_date(format = "")
)
ℹ Use `spec()` for the full column specifications.
46385 parsing failures.
row col expected actual file
3 -- 22 columns 10 columns '../data/assembly_summary_genbank.txt'
12 -- 22 columns 10 columns '../data/assembly_summary_genbank.txt'
26 -- 22 columns 10 columns '../data/assembly_summary_genbank.txt'
95 -- 22 columns 9 columns '../data/assembly_summary_genbank.txt'
96 -- 22 columns 9 columns '../data/assembly_summary_genbank.txt'
... ... .......... .......... ......................................
See problems(...) for more details.
# Keep only the taxonomy IDs, organism name, genome representation and
# download path from the assembly summary.
genbank_select <- select(
  genbank,
  taxid,
  species_taxid,
  organism_name,
  genome_rep,
  ftp_path
)
genbank_select
mmetsp_taxon
# Make sure both taxonomy ID columns are stored as character.
# NOTE(review): presumably so they compare cleanly with the text taxon_id in
# the MMETSP tables -- confirm against the downstream join.
genbank_select <- mutate(
  genbank_select,
  across(c(taxid, species_taxid), as.character)
)
genbank_select